This document is a walkthrough of how to use the code for the Turnout Tracker. For a discussion of the math and the model, see Turnout Tracker Math
There are a number of parameters that need to be fit on historical data: baseline turnout rates, precinct covariances, etc.
Each election should have a config, which I’ve created in config.R. config is a list with the following items:
library(tidyverse)
source("config.R")
print(config)
## $city
## [1] "Philadelphia"
##
## $timezone
## [1] "America/New_York"
##
## $election_ds
## [1] "2018-11-06"
##
## $start_hour
## [1] 7
##
## $end_hour
## [1] 20
##
## $precinct_shp_path
## [1] "data/2016_Ward_Divisions.shp"
##
## $precinct_id_col
## [1] "WARD_DIVSN"
##
## $grouping_column
## [1] "WARD"
##
## $turnout_df_path
## [1] "data/phila_turnout.csv"
##
## $submission_bitly
## [1] "http://bit.ly/sixtysixturnout"
All of the prep work is done in calc_params. The input is a dataframe, turnout_df, which has columns precinct, year, turnout. Precinct is the unique identifier for the precinct, year is the year, and turnout is the voter count. Internally, we’ll validate that there is a turnout value for every precinct in every year, so you will need to crosswalk turnout if boundaries have changed.
df <- read.csv(config$turnout_df_path)
df$precinct <- sprintf("%04d", df$precinct)
head(arrange(df, precinct, year))
## precinct year turnout
## 1 0101 2002 185
## 2 0101 2003 213
## 3 0101 2004 311
## 4 0101 2005 62
## 5 0101 2006 188
## 6 0101 2007 133
We can now calculate the historic modelParams:
source("../calc_params.R", chdir=TRUE)
params <- calc_params(
turnout_df=df,
n_svd=3
)
## [1] "Fitting fixed effects"
## [1] "Calculating svd"
## [1] "Fitted vs True values, check for similarity:"
## [1] "Fitted:"
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.20122235 -0.22843224 -0.2050572 0.09819725 -0.094607134
## [2,] -0.22445380 -0.24449478 -0.1998690 0.05219341 -0.102150840
## [3,] -0.13947889 -0.19904323 -0.2670056 0.28741003 -0.084777551
## [4,] -0.14371751 -0.17640034 -0.2178740 0.12173056 -0.090329415
## [5,] 0.06155359 -0.00320194 -0.1387948 0.37835454 0.006629272
## [6,] -0.15458746 -0.18669677 -0.0555262 0.22357500 -0.004799110
## [,6]
## [1,] -0.01263855
## [2,] -0.03140523
## [3,] 0.06522255
## [4,] 0.03098625
## [5,] 0.12496001
## [6,] -0.05994968
## [1] "True:"
## 2002 2003 2004 2005 2006 2007
## 1 -0.13392278 -0.13233324 -0.16263111 0.05177738 -0.18586698 -0.05514649
## 2 -0.24399747 -0.19871444 -0.18207979 0.01750989 -0.05476871 -0.03905395
## 3 -0.15035112 -0.16482529 -0.26299390 0.35096655 -0.10617836 -0.07878518
## 4 -0.07960678 -0.04235591 -0.23561070 0.09393181 -0.15915686 -0.11561143
## 5 0.04707932 -0.01616247 -0.02737701 0.49386307 -0.07025798 -0.07325783
## 6 0.07079530 -0.11547255 0.03681819 0.19894611 -0.13400843 -0.20410790
## [1] "Calculating covariances"
## params has a copy of turnout_df, with some new columns.
print(head(params@turnout_df))
## precinct year turnout log_turnout precinct_num
## 1 0101 2002 185 5.225747 1
## 2 0101 2003 213 5.365976 1
## 3 0101 2004 311 5.743003 1
## 4 0101 2005 62 4.143135 1
## 5 0101 2006 188 5.241747 1
## 6 0101 2007 133 4.897840 1
## params has an estimate of the year_fe, on the log scale.
print(head(params@year_fe))
## # A tibble: 6 x 2
## year year_fe
## <int> <dbl>
## 1 2002 5.401729
## 2 2003 5.540369
## 3 2004 5.947694
## 4 2005 4.133417
## 5 2006 5.469673
## 6 2007 4.995046
## params has an estimate of the precinct_fe, on the log scale.
print(head(params@precinct_fe))
## # A tibble: 6 x 2
## precinct precinct_fe
## <fctr> <dbl>
## 1 0101 -0.0420593
## 2 0102 0.1798068
## 3 0103 0.4355977
## 4 0104 0.2385597
## 5 0105 -0.3368203
## 6 0106 -0.3250296
## params has the svd results, which is used for the covariance (more on this later).
print(head(params@svd$u))
## [,1] [,2] [,3]
## [1,] -0.02753600 -0.018012573 0.015247785
## [2,] -0.02615421 -0.021730175 0.013989446
## [3,] -0.03819729 -0.006007496 0.020654966
## [4,] -0.02899969 -0.010724228 0.009013713
## [5,] -0.02359510 0.017117010 0.017244166
## [6,] -0.01262134 -0.012197165 0.037356959
print(head(params@svd$v))
## [,1] [,2] [,3]
## [1,] 0.08179677 0.40037191 -0.08882268
## [2,] 0.13818185 0.33643029 -0.15317697
## [3,] 0.32635208 0.05219684 0.11206456
## [4,] -0.21727602 0.44507613 0.55907564
## [5,] 0.11560642 0.13599503 0.12837716
## [6,] -0.13106190 0.23643859 -0.09845040
## params has the estimated covariance matrix among precincts (and its inverse)
print(params@precinct_cov[1:6, 1:6])
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.05040144 0.03955334 0.04671025 0.03700208 0.02062539 0.02495767
## [2,] 0.03955334 0.05066582 0.04491882 0.03628276 0.01760325 0.02485196
## [3,] 0.04671025 0.04491882 0.07272259 0.04640707 0.03629336 0.02887542
## [4,] 0.03700208 0.03628276 0.04640707 0.04692306 0.02376431 0.02098243
## [5,] 0.02062539 0.01760325 0.03629336 0.02376431 0.04257291 0.01359855
## [6,] 0.02495767 0.02485196 0.02887542 0.02098243 0.01359855 0.03583655
I also provide some helper functions to make diagnostic plots. These require an sf object with the precinct shapefiles. The sf must have a column precinct which matches the id column in turnout_df.
The diagnostics include plots of (a) the fixed effects by precinct and by year, and (b) the svd components for the estimated covariances, along with each dimension’s score in each year. You should sanity check that the combination of precincts and elections make sense.
library(sf)
divs <- st_read(config$precinct_shp_path, quiet = TRUE, )
divs$precinct <- divs[[config$precinct_id_col]]
head(divs)
## Simple feature collection with 6 features and 6 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 2670826 ymin: 239354.5 xmax: 2684800 ymax: 240827.4
## epsg (SRID): NA
## proj4string: +proj=lcc +lat_1=39.93333333333333 +lat_2=40.96666666666667 +lat_0=39.33333333333334 +lon_0=-77.75 +x_0=600000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs
## YEAR WARD_DIVSN WARD DIVSN AREA_SFT precinct
## 1 2016 3403 34 3 629193 3403
## 2 2016 0605 6 5 1052544 0605
## 3 2016 0420 4 20 725059 0420
## 4 2016 0413 4 13 841331 0413
## 5 2016 0602 6 2 890778 0602
## 6 2016 2411 24 11 956738 2411
## geometry
## 1 POLYGON ((2671041.01971254 ...
## 2 POLYGON ((2681066.86908339 ...
## 3 POLYGON ((2672172.5558353 2...
## 4 POLYGON ((2672189.19031645 ...
## 5 POLYGON ((2680101.55113213 ...
## 6 POLYGON ((2684655.94097348 ...
diagnostics(params, divs)
## [1] "Plotting Diagnostics..."
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
The plots look good. Dimension 1 is blue for Hispanic North Philly and the University of Pennsylvania, and the line plot shows that these precincts had disproportionately high turnout in 2004, 2008, 2012, 2016 (the presidential elections). Dimension 2 has captured population change (red divisions are increasing, blue divisions are decreasing). Dimension 3 is hard to interpret, and may be noise/misfitting…
Let’s save the results and move on.
save_with_backup(params, stem="params", dir="outputs")
An important validation is to test the model on a fake, known distribution. The function load_data will either load data from our google-form download (later), or create a fake dataset with an S-curve.
source("../fit_submissions.R", chdir=TRUE)
data_list <- load_data(use_real_data=FALSE, params=params, election_config=config)
raw_data <- data_list$raw_data
fake_data <- data_list$fake_data
print("True Turnout to be estimated")
## [1] "True Turnout to be estimated"
fake_data$true_turnout
## [1] 455408.7
em_fit <- fit_em_model(
raw_data, params, verbose=FALSE, tol=1e-10, use_inverse=FALSE, election_config = config
)
## [1] "n_iter = 40"
fit <- process_results(
em_fit$precinct_re_fit,
em_fit$loess_fit,
raw_data,
em_fit$resid,
params,
election_config=config,
plots = TRUE,
save_results = FALSE,
fake_data=fake_data
)
## [1] "predicting loess"
## [1] "plots"
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## Press <Enter> to continue...
## [1] "div_turnout"
## [1] "time_df"
## [1] "full_predictions"
print("Estimate:")
## [1] "Estimate:"
fit$full_predictions %>%
filter(time_of_day == max(time_of_day)) %>%
with(sum(prediction))
## [1] 444393.8
But we don’t want a single estimate, we want a bootstrap of estimates. This can take a few minutes…:
source("../bootstrap.R", chdir=TRUE)
bs <- fit_bootstrap(
raw_data,
params,
election_config=config,
n_boot=100,
use_inverse=FALSE,
verbose=FALSE
)
## [1] "Raw Result"
## [1] "n_iter = 40"
## [1] "n_iter = 60"
## [1] "n_iter = 54"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 62"
## [1] "n_iter = 54"
## [1] "n_iter = 58"
## [1] "n_iter = 57"
## [1] "n_iter = 56"
## [1] "n_iter = 62"
## [1] "n_iter = 51"
## [1] "n_iter = 61"
## [1] "n_iter = 56"
## [1] "n_iter = 56"
## [1] "n_iter = 64"
## [1] "n_iter = 60"
## [1] "n_iter = 62"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 59"
## [1] "n_iter = 58"
## [1] "n_iter = 57"
## [1] "n_iter = 57"
## [1] "n_iter = 54"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 67"
## [1] "n_iter = 55"
## [1] "n_iter = 55"
## [1] "n_iter = 60"
## [1] "n_iter = 57"
## [1] "n_iter = 57"
## [1] "n_iter = 56"
## [1] "n_iter = 66"
## [1] "n_iter = 55"
## [1] "n_iter = 63"
## [1] "n_iter = 54"
## [1] "n_iter = 57"
## [1] "n_iter = 69"
## [1] "n_iter = 58"
## [1] "n_iter = 49"
## [1] "n_iter = 56"
## [1] "n_iter = 62"
## [1] "n_iter = 60"
## [1] "n_iter = 57"
## [1] "n_iter = 53"
## [1] "n_iter = 56"
## [1] "n_iter = 52"
## [1] "n_iter = 55"
## [1] "n_iter = 57"
## [1] "n_iter = 60"
## [1] "n_iter = 61"
## [1] "n_iter = 57"
## [1] "n_iter = 53"
## [1] "n_iter = 51"
## [1] "n_iter = 57"
## [1] "n_iter = 49"
## [1] "n_iter = 50"
## [1] "n_iter = 55"
## [1] "n_iter = 57"
## [1] "n_iter = 57"
## [1] "n_iter = 59"
## [1] "n_iter = 55"
## [1] "n_iter = 59"
## [1] "n_iter = 62"
## [1] "n_iter = 60"
## [1] "n_iter = 56"
## [1] "n_iter = 57"
## [1] "n_iter = 53"
## [1] "n_iter = 57"
## [1] "n_iter = 58"
## [1] "n_iter = 53"
## [1] "n_iter = 54"
## [1] "n_iter = 58"
## [1] "n_iter = 57"
## [1] "n_iter = 64"
## [1] "n_iter = 60"
## [1] "n_iter = 58"
## [1] "n_iter = 63"
## [1] "n_iter = 57"
## [1] "n_iter = 53"
## [1] "n_iter = 59"
## [1] "n_iter = 61"
## [1] "n_iter = 60"
## [1] "n_iter = 57"
## [1] "n_iter = 59"
## [1] "n_iter = 57"
## [1] "n_iter = 62"
## [1] "n_iter = 56"
## [1] "n_iter = 56"
## [1] "n_iter = 57"
## [1] "n_iter = 55"
## [1] "n_iter = 54"
## [1] "n_iter = 59"
## [1] "n_iter = 51"
## [1] "n_iter = 53"
## [1] "n_iter = 61"
## [1] "n_iter = 56"
## [1] "BS Turnout: 425748 448993 472533"
gg_bs_hist <- hist_bootstrap(bs)
print(gg_bs_hist)
gg_turnout <- turnout_plot(
bs,
raw_data,
fit,
election_config=config,
ref_turnout=c(300e3, 500e3),
ref_text=c("2014 turnout = 300,000", "2016 turnout = 500,000")
)
print(gg_turnout)
save_with_backup(
bs,
stem="bootstrap",
dir="outputs"
)
for(plotname in c("gg_turnout", "gg_bs_hist")){
ggsave_with_backup(
get(plotname),
filestem=plotname,
plottype="png",
width = 7,
height=7,
dir='outputs'
)
}